# load packages
library(tidyverse)
library(corrr)
library(plotly)
# Read in the gapminder_clean.csv data as a tibble using read_csv.
# load in gapminder dataset
# The first CSV column is an unnamed row index. readr invents a name for it
# ("X1" in readr 1.x, "...1" in readr >= 2.0), so it is addressed by position
# instead of by that version-dependent name.
# NOTE: column_to_rownames() returns a plain data.frame with row names, so
# `gapminder` is no longer a tibble after this step.
gapminder <- read_csv("gapminder_clean.csv") %>%
  column_to_rownames(var = names(.)[1])
## Warning: Missing column names filled in: 'X1' [1]
head(gapminder)
## Country Name Year Agriculture, value added (% of GDP)
## 0 Afghanistan 1962 NA
## 1 Afghanistan 1967 NA
## 2 Afghanistan 1972 NA
## 3 Afghanistan 1977 NA
## 4 Afghanistan 1982 NA
## 5 Afghanistan 1987 NA
## CO2 emissions (metric tons per capita)
## 0 0.07378134
## 1 0.12378238
## 2 0.13082014
## 3 0.18311831
## 4 0.16587912
## 5 0.27556031
## Domestic credit provided by financial sector (% of GDP)
## 0 21.276422
## 1 9.917662
## 2 18.880833
## 3 13.836822
## 4 NA
## 5 NA
## Electric power consumption (kWh per capita)
## 0 NA
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## Energy use (kg of oil equivalent per capita)
## 0 NA
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## Exports of goods and services (% of GDP)
## 0 4.878051
## 1 6.772908
## 2 14.763231
## 3 11.662904
## 4 NA
## 5 NA
## Fertility rate, total (births per woman) GDP growth (annual %)
## 0 7.450 NA
## 1 7.450 NA
## 2 7.450 NA
## 3 7.449 NA
## 4 7.450 NA
## 5 7.461 NA
## Imports of goods and services (% of GDP) Industry, value added (% of GDP)
## 0 9.349593 NA
## 1 14.209827 NA
## 2 18.105850 NA
## 3 14.823175 NA
## 4 NA NA
## 5 NA NA
## Inflation, GDP deflator (annual %) Life expectancy at birth, total (years)
## 0 NA 33.21990
## 1 NA 35.38941
## 2 NA 37.61015
## 3 NA 40.11015
## 4 NA 43.23073
## 5 NA 47.29634
## Population density (people per sq. km of land area)
## 0 14.31206
## 1 15.88181
## 2 17.94703
## 3 19.99893
## 4 19.40232
## 5 17.36656
## Services, etc., value added (% of GDP) pop continent gdpPercap
## 0 NA 10267083 Asia 853.1007
## 1 NA 11537966 Asia 836.1971
## 2 NA 13079460 Asia 739.9811
## 3 NA 14880372 Asia 786.1134
## 4 NA 12881816 Asia 978.0114
## 5 NA 13867957 Asia 852.3959
# Filter the data to include only rows where Year is 1962 and then make a
# scatter plot comparing 'CO2 emissions (metric tons per capita)' and gdpPercap
# for the filtered data.
# filter only Year 1962
# Year is a numeric column, so compare against a number rather than a string
# (a string literal forces an implicit numeric -> character coercion).
gapminder_1962 <- gapminder %>%
  filter(Year == 1962)
# plot gdp v CO2 (x = gdpPercap, y = CO2 emissions)
# The title names only what this plot encodes; continent and population size
# are not mapped to any aesthetic here.
gapminder_1962 %>%
  ggplot(aes(gdpPercap, `CO2 emissions (metric tons per capita)`)) +
  geom_point() +
  labs(title = "CO2 emissions by gdpPercap (1962)")
## Warning: Removed 151 rows containing missing values (geom_point).
# On the filtered data, calculate the correlation of
# 'CO2 emissions (metric tons per capita)' and gdpPercap. What is the
# correlation and associated p value?
# cor.test() is called without a `method`, so its default (Pearson) applies.
gdp.co2.cor <- cor.test(
  x = gapminder_1962$gdpPercap,
  y = gapminder_1962$`CO2 emissions (metric tons per capita)`
)
# correlation val
gdp.co2.cor[["estimate"]]
## cor
## 0.9260817
# correlation p val
gdp.co2.cor[["p.value"]]
## [1] 1.128679e-46
# On the unfiltered data, answer "In what year is the correlation between
# 'CO2 emissions (metric tons per capita)' and gdpPercap the strongest?"
# Filter the dataset to that year for the next step...
gapminder %>%
  group_by(Year) %>%
  # One correlation per year; "na.or.complete" uses only complete pairs and
  # yields NA for a year that has none.
  summarize(
    correlation = cor(
      `CO2 emissions (metric tons per capita)`,
      gdpPercap,
      use = "na.or.complete"
    )
  ) %>%
  # "Strongest" is judged by magnitude, so a strong negative correlation would
  # also qualify; na.rm keeps a single NA year from turning the maximum into NA
  # and emptying the result.
  filter(abs(correlation) == max(abs(correlation), na.rm = TRUE))
## # A tibble: 1 x 2
## Year correlation
## <dbl> <dbl>
## 1 1967 0.939
# Keep only the rows for 1967. Year is numeric, so compare against a number
# rather than a string (which would rely on implicit coercion).
gapminder_1967 <- gapminder %>%
  filter(Year == 1967)
# Using plotly, create an interactive scatter plot comparing
# 'CO2 emissions (metric tons per capita)' and gdpPercap, where the point size
# is determined by pop (population) and the color is determined by the
# continent. You can easily convert any ggplot plot to a plotly plot using the
# ggplotly() command.
plt <- gapminder_1967 %>%
  ggplot(
    aes(
      gdpPercap,
      `CO2 emissions (metric tons per capita)`,
      color = continent,
      size = pop
    )
  ) +
  geom_point() +
  # The size legend title is intentionally left blank.
  labs(
    title = "CO2 emissions by gdpPercap,continent, population size",
    color = "continent",
    size = ""
  )
# Convert the static ggplot into an interactive plotly widget.
ggplotly(plt)
Now, without further guidance, use your R Data Science skills (and appropriate statistical tests) to answer the following:
What is the relationship between continent and 'Energy use (kg of oil equivalent per capita)'? (stats test needed)
Energy use differs significantly between continents (one-way ANOVA, p < 2e-16, below the 0.05 threshold).
# One-way ANOVA: does per-capita energy use differ between continents?
# Keep just the two model variables and discard rows missing either of them.
gapminder_contE <- gapminder %>%
  select(continent, `Energy use (kg of oil equivalent per capita)`) %>%
  drop_na()
# Model energy use as a function of continent.
contE_aov <- aov(
  `Energy use (kg of oil equivalent per capita)` ~ continent,
  data = gapminder_contE
)
# Print the ANOVA table (F statistic and p-value for the continent term).
summary(contE_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 4 7.715e+08 192870621 51.46 <2e-16 ***
## Residuals 843 3.160e+09 3748033
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Is there a significant difference between Europe and Asia with respect to 'Imports of goods and services (% of GDP)' in the years after 1990? (stats test needed)
There is no significant difference (p = 0.1776 > 0.05) between Europe's and Asia's imports of goods and services (% of GDP) after 1990.
# Import shares for Asia and Europe in the years after 1990.
# Year is numeric, so the cutoff is a number: comparing against the string
# "1990" would coerce Year to character and order the values as text.
gapminder_1990 <- gapminder %>%
  filter(Year > 1990, continent %in% c("Asia", "Europe")) %>%
  select(continent, `Imports of goods and services (% of GDP)`) %>%
  drop_na()
# Two-sample t-test of imports by continent; t.test() does not assume equal
# variances by default, i.e. this is the Welch test.
t.test(
  `Imports of goods and services (% of GDP)` ~ continent,
  data = gapminder_1990
)
##
## Welch Two Sample t-test
##
## data: Imports of goods and services (% of GDP) by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.321099 12.433240
## sample estimates:
## mean in group Asia mean in group Europe
## 46.84531 41.78924
# What is the country (or countries) that has the highest
# 'Population density (people per sq. km of land area)' across all years?
# (i.e., which country has the highest average ranking in this category across
# each time point in the dataset?)
# NOTE(review): this compares countries by their mean density, not by their
# mean per-year rank, and mean() without na.rm followed by drop_na() excludes
# every country that has a missing value in any year. Confirm this matches the
# intended question.
gapminder %>%
  group_by(`Country Name`) %>%
  summarize(
    mean_popdensity = mean(
      `Population density (people per sq. km of land area)`
    )
  ) %>%
  drop_na() %>%
  # Keep the country (or tied countries) with the largest mean density.
  filter(mean_popdensity == max(mean_popdensity))
## # A tibble: 1 x 2
## `Country Name` mean_popdensity
## <chr> <dbl>
## 1 Macao SAR, China 14732.
# What country (or countries) has shown the greatest increase in
# 'Life expectancy at birth, total (years)' since 1962?
gapminder %>%
  select(
    `Country Name`,
    Year,
    `Life expectancy at birth, total (years)`
  ) %>%
  group_by(`Country Name`) %>%
  # Reshape to one row per country with one column per year.
  pivot_wider(
    names_from = Year,
    values_from = `Life expectancy at birth, total (years)`
  ) %>%
  # Only the two end points are needed.
  # NOTE(review): assumes 2007 is the latest year in the data; TODO confirm.
  select(`Country Name`, `1962`, `2007`) %>%
  mutate(change_lifeexp = `2007` - `1962`) %>%
  # Countries missing either end point cannot be compared.
  drop_na() %>%
  # subset() ignores the grouping and evaluates the condition on the whole
  # column, so max() here is the overall maximum across all countries.
  subset(change_lifeexp == max(change_lifeexp))
## # A tibble: 1 x 4
## # Groups: Country Name [1]
## `Country Name` `1962` `2007` change_lifeexp
## <chr> <dbl> <dbl> <dbl>
## 1 Maldives 38.5 75.4 36.9